import requests
from bs4 import BeautifulSoup
import csv
import openpyxl

# Request headers: present a real browser User-Agent so the site does not
# block the scraper as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36"
}

url = "https://gaokao.eol.cn/e_html/gk/2022/985yx/index.html"
resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
# The server's declared charset is unreliable; let requests sniff the real one
# so the Chinese text decodes correctly.
resp.encoding = resp.apparent_encoding
html = resp.text

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table")
if table is None:
    # Without this guard the next line would raise a bare AttributeError.
    raise RuntimeError(f"No <table> found in page (layout may have changed): {url}")

# Each output row is [region, school, subjects]. The region cell spans
# multiple rows in the HTML, so 2-cell rows inherit the most recent region.
data = []
current_region = None
for tr in table.find_all("tr"):
    tds = tr.find_all("td")
    if not tds:
        continue  # rows with only <th> cells, or empty rows

    # Skip the textual header row ("地区" = region)
    if "地区" in tds[0].get_text():
        continue

    if len(tds) == 3:
        # Row opens a new region group: [region, school, subjects]
        current_region = tds[0].get_text(strip=True)
        region = current_region
        school = tds[1].get_text(strip=True)
        subjects = tds[2].get_text(strip=True)
    elif len(tds) == 2:
        # Continuation row: region carried over from the rowspan cell
        region = current_region
        school = tds[0].get_text(strip=True)
        subjects = tds[1].get_text(strip=True)
    else:
        continue  # unexpected cell count — skip defensively

    data.append([region, school, subjects])

# -------- Save as CSV --------
# utf-8-sig prepends a BOM so Excel detects the encoding and renders the
# Chinese text correctly when the CSV is opened directly.
csv_file = "985_universities.csv"
csv_header = ["地区", "学校名称", "一流学科建设名单"]
with open(csv_file, "w", newline="", encoding="utf-8-sig") as fh:
    csv.writer(fh).writerows([csv_header, *data])

# -------- Save as Excel --------
excel_file = "985_universities.xlsx"
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = "985高校一流学科"

# Write the header row followed by every scraped record.
for record in [["地区", "学校名称", "一流学科建设名单"], *data]:
    sheet.append(record)

workbook.save(excel_file)

print(f"✅ 数据已保存为 {csv_file} 和 {excel_file}")
